Package Version

In [1]:
#!pip install "plotly == 4.14.3"
#!pip install "matplotlib == 3.3.4"
#!pip install -U pipenv 
#!pip install "flair == 0.9"
#!pip install "gensim == 3.8.3" 
#!pip install "scikit-learn == 0.24.1" 

Imports

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from flair.models import TextClassifier
from flair.data import Sentence
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsTransformer
import re
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\mdh9011\anaconda\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.

Data Loading and Processing

In [3]:
#Load Dataframe
df = pd.read_csv("Data/BeerDataScienceProject.csv", encoding='latin-1')
In [4]:
#Convert timestamps to DateTime
df["review_time"] = pd.to_datetime(df["review_time"], unit="s")
In [5]:
#Check for null Values
df.isna().apply(lambda ser: ser.value_counts()).T
Out[5]:
False True
beer_ABV 508590.0 20280.0
beer_beerId 528870.0 NaN
beer_brewerId 528870.0 NaN
beer_name 528870.0 NaN
beer_style 528870.0 NaN
review_appearance 528870.0 NaN
review_palette 528870.0 NaN
review_overall 528870.0 NaN
review_taste 528870.0 NaN
review_profileName 528755.0 115.0
review_aroma 528870.0 NaN
review_text 528751.0 119.0
review_time 528870.0 NaN
In [6]:
#Remove all null datapoints
df = df.dropna()

Question 1: Rank top 3 Breweries which produce the strongest beers?

Different ways to answer this question:

  • Get the 3 Breweries that make the single strongest beers
  • Get the 3 Breweries that make the strongest on average beers
In [7]:
#Get top 3 Brewaries according to highest ABV
top3_abv = df.groupby("beer_brewerId").max().sort_values("beer_ABV", ascending=False).head(3)
In [8]:
#Get top 3 Brewaries according to highest ABV
top3_abv_avg = df.groupby("beer_brewerId").mean().sort_values("beer_ABV", ascending=False).head(3)
In [9]:
fig = go.Figure()
fig.add_trace(go.Bar(x=list(top3_abv_avg.index.astype(str)), y=top3_abv_avg["beer_ABV"].values.tolist(), marker_color="#96D294"))
fig.update_layout(
    title = "Top 3 Breweries which Produce Strongest Beers on Average",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)

fig.add_hline(y=df.groupby("beer_brewerId").mean().mean()["beer_ABV"], line_width=3, line_dash="dash", row=3, col="all", line_color="salmon",
              annotation_text="Average", 
              annotation_position="bottom right")


fig.update_xaxes(title="Brewery ID")
fig.update_yaxes(title="Beer ABV")
In [10]:
fig = go.Figure()
fig.add_trace(go.Bar(x=list(top3_abv_avg.index.astype(str)), y=top3_abv_avg["beer_ABV"].values.tolist(), marker_color="#96D294"))
fig.update_layout(
    title = "Top 3 Breweries which Produce Strongest Beers on Average",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)

fig.add_hline(y=df.groupby("beer_brewerId").mean().mean()["beer_ABV"], line_width=3, line_dash="dash", row=3, col="all", line_color="salmon",
              annotation_text="Average", 
              annotation_position="bottom right")


fig.update_xaxes(title="Brewery ID")
fig.update_yaxes(title="Beer ABV")

Question 2. Which year did beers enjoy the highest ratings?

Get the average overall rating per year and plot

In [11]:
#Sort by time and resample to a yearly period taking the mean
yearly_df = df.set_index("review_time").sort_index().resample("y").mean()
In [12]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=yearly_df.index, y=yearly_df["review_overall"], marker_color="#96D294"))

fig.add_annotation(x="2000-12-31", y=4.233333,
            text="2000 - 4.23",
            showarrow=True,
            arrowhead=1,
                  font=dict(
            family="Courier New, monospace",
            size=18,
            color="#ffffff",
            ),
        align="center",
        bgcolor="#ff7f0e",
        bordercolor="#c7c7c7",
        arrowwidth=2)

fig.update_layout(
    title = "Average Overall Beer Ratings Between 1998 and 2012",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Year")
fig.update_yaxes(title="Average Overall Rating")

Question 3. Based on the user’s ratings which factors are important among taste, aroma, appearance, and palette?

In [13]:
#Get correlations among all review categories
review_correlations = df[[i for i in df.columns if "review" in i]].corr().round(2)
In [14]:
#Process Column names of correlation
review_correlations.columns = ["Appearance", "Palette", "Overall", "Taste", "Aroma"]
review_correlations.index = ["Appearance", "Palette", "Overall", "Taste", "Aroma"]
In [15]:
#Create Fiure
fig, ax = plt.subplots(figsize=(10,10))
im = ax.imshow(review_correlations)

#Set figure Size
ax.set_xticks(np.arange(review_correlations.shape[1]))
ax.set_yticks(np.arange(review_correlations.shape[1]))

#label With Features
ax.set_xticklabels(review_correlations.columns)
ax.set_yticklabels(review_correlations.columns)
plt.yticks(fontsize=18,)
plt.xticks(fontsize=18,)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Loop over data dimensions and create text annotations.
for i in range(review_correlations.shape[1]):
    for j in range(review_correlations.shape[1]):
        text = ax.text(j, i, review_correlations.values[i, j],
                       ha="center", va="center", color="w")

ax.set_title("Feature Correlations", fontsize=22)
fig.tight_layout()
plt.show()
In [16]:
overall_review = review_correlations["Overall"].drop("Overall").sort_values(ascending=False)
fig = go.Figure()
fig.add_trace(go.Bar(x=overall_review.index, y=overall_review, marker_color="salmon"))
fig.update_layout(
    title = "Correlation Between Overall Review and Other Factors",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Factors")
fig.update_yaxes(title="Correlation")
In [17]:
def mutate(x):
    return x + np.random.uniform(-0.1, 0.1)
In [18]:
#sample down data
sample_df = df.sample(5000)

#add random mutation
sample_df["review_aroma"] = sample_df["review_aroma"].apply(lambda x: mutate(x))
sample_df["review_overall"] = sample_df["review_overall"].apply(lambda x: mutate(x))

fig = go.Figure()
fig.add_trace(go.Scatter(x=sample_df["review_aroma"], y=sample_df["review_overall"], mode="markers"))
fig.update_layout(
    title = "Overall Rating Compared to Aroma Rating",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Aroma Rating")
fig.update_yaxes(title="Overall Rating")

Question 4. If you were to recommend 3 beers to your friends based on this data which ones will you recommend?

To solve this, will get the beers with the overall best review that are closest to the average ABV (dont want it to be too powerful)

In [19]:
#Sort beers closes to average abv
df["abv_avg_distance"] = (df["beer_ABV"] - df["beer_ABV"].mean()).abs().sort_values()
In [20]:
recommended_3 = df.sort_values(["review_overall", "abv_avg_distance"], ascending=[False, True]).iloc[:4]
1. Wee Beast (Cuillin Beast)
2. Turbo Shandy
3. IPA

Question 5. Which Beer style seems to be the favorite based on reviews written by users?

To solve this question, we follow two approaches:

  1. Get the average overall rating per Beer Style
  2. Get the sentiment value for the written texts, then get the highest average sentiment among the beer styles
In [21]:
#Approach 1
beer_style_df = df.groupby("beer_style").mean()
top_10_beer_style = beer_style_df["review_overall"].sort_values(ascending=False).head(10)
In [22]:
fig = go.Figure()
fig.add_trace(go.Bar(x=top_10_beer_style.index, y=top_10_beer_style, marker_color="salmon"))
fig.update_layout(
    title = "top 10 Beer Styles by Average Overall Ratings",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Style")
fig.update_yaxes(title="Overall Rating")

Approach 2, use flair (ready sentiment analysis) vs our own sentiment model

In [23]:
#Approach 2 
#User Flair for quick sentiment analysis
classifier = TextClassifier.load('en-sentiment')
2021-11-02 19:30:42,822 loading file C:\Users\mdh9011\.flair\models\sentiment-en-mix-distillbert_4.pt
In [24]:
#Sample dataframe for speed calculation
sample_df = df.sample(5000)
In [25]:
#Prepare list for sentiment values
review_sentiment = []
for review in sample_df["review_text"]:
    #Create Sentence of review
    sentence = Sentence(review)
    
    #Predict Sentiment of review
    classifier.predict(sentence)
    
    #Store value of sentiment, if negative store as - and if positive store as +
    sentiment = sentence.labels[0].score
    if sentence.labels[0].value == "NEGATIVE":
        sentiment = sentiment*-1
    
    review_sentiment.append(sentiment)
In [26]:
#Add sentiment value to dataframe
sample_df["review_sentiment"] = review_sentiment
In [27]:
#Approach 1
beer_sentiment_df = sample_df.groupby("beer_style").mean()
top_10_beer_sentiment = beer_sentiment_df["review_sentiment"].sort_values(ascending=False).head(10)
In [28]:
fig = go.Figure()
fig.add_trace(go.Bar(x=top_10_beer_sentiment.index, y=top_10_beer_sentiment, marker_color="salmon"))
fig.update_layout(
    title = "top 10 Beer Styles by Average Written Rating Sentiment",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Style")
fig.update_yaxes(title="Overall Rating")

Creating Our Own Sentiment Analysis Model

In [29]:
#Get Positive and Negative reviews
df["sentiment"] = df["review_overall"].apply(lambda x: "positive" if x > 2.5 else "negative")
In [30]:
sample_df = pd.concat([df[df["sentiment"] == "positive"].sample(500), df[df["sentiment"] == "negative"].sample(500)])
In [31]:
sample_df.shape
Out[31]:
(1000, 15)
In [32]:
#load model
model = KeyedVectors.load_word2vec_format('Models/GoogleNews-vectors-negative300.bin', binary=True)
In [33]:
def get_text_embeddings(text):
    empty_word = np.zeros(300)
    text = text.replace("\xa0", " ")
    text = text.replace("\n", " ")
    text = re.sub(r'[.,()!@#$?]', '', text)
    text = remove_stopwords(text)
    all_words = []
    for word in text.split():
        if len(all_words) <=120:
            try:
                all_words.append(model[word])
            except:
                all_words.append(empty_word)
                
    
    while len(all_words) <=120:
        all_words.append(empty_word)
        
    return np.array(all_words).flatten()
In [34]:
#Get embeddings
sample_df["review_text_embeddings"] = sample_df["review_text"].apply(lambda x: get_text_embeddings(x))
In [35]:
#Create SVM
sentiment_features = np.array(sample_df["review_text_embeddings"].values.tolist())

sentiment_targets = sample_df["sentiment"]

x_train, x_test, y_train, y_test = train_test_split(sample_df, sentiment_targets)
In [36]:
sentiment_model = SVC().fit(np.array(x_train["review_text_embeddings"].values.tolist()), y_train)
In [37]:
#custom model
predictions = sentiment_model.predict(np.array(x_test["review_text_embeddings"].values.tolist()))
In [38]:
accuracy_score(y_test, predictions)
Out[38]:
0.58
In [39]:
flair_predictions = []
for text in x_test["review_text"]:
    sentence = Sentence(review)
    
    #Predict Sentiment of review
    classifier.predict(sentence)
    if sentence.labels[0].value == "NEGATIVE":
        flair_predictions.append("negative")
    else:
        flair_predictions.append("positive")
In [40]:
accuracy_score(y_test, flair_predictions)
Out[40]:
0.484
In [41]:
fig = go.Figure()
fig.add_trace(go.Bar(x=["Custom", "Flair"], y=[accuracy_score(y_test, predictions), accuracy_score(y_test, flair_predictions)], marker_color="#96D294"))
fig.update_layout(
    title = "Custom Sentiment Analysis vs Flair",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)


fig.update_xaxes(title="Model",)
fig.update_yaxes(title="Accuracy",  tickformat="%")

Question 6. How does written review compare to overall review score for the beer styles?

To solve this question we need to compare the overall rating and the given sentiment of the text. In order to do this we:

  • Normalize the rating values between -1 to 1
  • Correlate the rating values with the sentiments
  • Compare positive/negative accuracy between positive and negative
In [42]:
def normalize(x):
    return (x - 2.5) / 2.5
In [43]:
df["review_overall_normalized"] = df["review_overall"].apply(lambda x: normalize(x))
In [44]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=df["review_overall_normalized"], marker_color="salmon"))
fig.update_layout(
    title = "Normalized Overall Review Distribution",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Rating")
fig.update_yaxes(title="Number of Reviews")
In [73]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=review_sentiment, marker_color="salmon"))
fig.update_layout(
    title = "Sentiment Distribution",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)
fig.update_xaxes(title="Sentiment")
fig.update_yaxes(title="Number of Reviews")

Compare Accuracy of Predicting whether review will be positive or negative

In [74]:
fig = go.Figure()
fig.add_trace(go.Bar(x=["Custom", "Flair"], y=[accuracy_score(y_test, predictions), accuracy_score(y_test, flair_predictions)], marker_color="#96D294"))
fig.update_layout(
    title = "Custom Sentiment Analysis vs Flair",
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
    width=700,
    font=dict(size=14)
)


fig.update_xaxes(title="Model",)
fig.update_yaxes(title="Accuracy",  tickformat="%")

Question 7. How do we find similar beer drinkers by using written reviews only?

In order to find similar drinkers, we need to compare the reviews written by the different users by comparing the content within them. Use embeddings from before and build a KNN classifier to get the 5 most similar reviewers

In [75]:
embeddings = np.array(sample_df["review_text_embeddings"].values.tolist())
In [76]:
knn = KNeighborsTransformer().fit(embeddings)
In [77]:
neighbors = knn.kneighbors([sample_df[sample_df["review_profileName"] == "Brad007"]["review_text_embeddings"].iloc[0]], return_distance=False)
In [78]:
for index in neighbors[0][1:]:
    print(sample_df.iloc[index]["review_profileName"])
righthereisay
John
rastaman
TXHops

Getting Neighbors for user Brad007

1.Righthereisay

2. John

3. Rastaman

4. TXHops

In [ ]: